import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import scipy
# import scipy
import matplotlib as mlt
import os
import json
import csv
# Load the Spotify track-feature table and take a first look at it.
spotify = pd.read_csv('SpotifyFeatures.csv')
spotify.head()
spotify.dtypes
spotify.isnull().sum(axis=0)
spotify.describe()

# Genre labels as they appear in the raw file.
spotify['genre'].value_counts()
spotify['genre'].unique().tolist()

# Merge the curly-apostrophe spelling of "Children's Music" into the
# straight-apostrophe one, and relabel 'Reggaeton' rows as 'Reggae'.
genre_fixes = {
    'Children’s Music': "Children's Music",
    'Reggaeton': "Reggae",
}
spotify['genre'] = spotify['genre'].replace(genre_fixes)
spotify['genre'].unique().tolist()

# Number of rows that exactly repeat an earlier row.
spotify.duplicated(keep='first').sum()
# Distribution of track popularity with the median marked.
sns.set(rc={'figure.figsize': (20, 8)})
sns.histplot(
    data=spotify,
    x="popularity",
    # Normalise the bars so the y-axis really is a density, matching the
    # axis label and the title below (the default statistic is a raw count).
    stat="density",
    color="#1F3552",
    label="popularity",
    kde=True,
)
# Labelled so the median line shows up in the legend as well.
plt.axvline(x=spotify['popularity'].median(), color='skyblue', label='median')
plt.ylabel('Density')
plt.title('Density distribution for popularity')
plt.legend()
plt.show()

# Same variable shifted by one and binned into 20 bars. The median marker is
# computed from the shifted values so that it sits on the plotted scale.
shifted_popularity = spotify['popularity'] + 1
sns.histplot(shifted_popularity, bins=20)
plt.axvline(x=shifted_popularity.median(), color='yellow')
import plotnine
from plotnine import *
from plotnine import data

# Box plot of popularity per genre, assembled layer by layer with plotnine.
genre_boxes = geom_boxplot(
    colour="#1F3552",
    fill="#4271AE",
    alpha=0.7,
    outlier_shape=".",
    outlier_colour="steelblue",
)
p10 = ggplot(spotify, aes("genre", "popularity")) + genre_boxes
# Axis labels.
p10 = p10 + xlab("genre") + ylab("Popularity")
# Pin the y-axis to the 0-100 range.
p10 = p10 + scale_y_continuous(limits=[0, 100])
# Title and figure size.
p10 = p10 + ggtitle("Song Popularity by Genre")
p10 = p10 + theme(figure_size=(24, 8))
p10
# Violin plot of the popularity distribution within each genre.
sns.set(rc={'figure.figsize': (20, 8)})
ax = sns.violinplot(data=spotify, x="genre", y="popularity", palette="muted")

# Both axis labels share the same padding, weight and size.
axis_label_style = dict(labelpad=20, weight='bold', size=15)
ax.set_title("violin plot for Popularity by Genre", size=20)
ax.set_xlabel("Genre", **axis_label_style)
ax.set_ylabel("Popularity", **axis_label_style)
# Tilt the genre names on the x-axis.
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
# Per-genre mean of every numeric column, most popular genre first.
# numeric_only=True restricts the aggregation to numeric columns; without it
# pandas >= 2.0 raises a TypeError as soon as a non-numeric column is averaged.
sp1 = spotify.groupby("genre").mean(numeric_only=True)
sp1.sort_values(by='popularity', ascending=False, inplace=True)
mlt.style.use('fivethirtyeight')

# Average Popularity
ax = sp1['popularity'].plot(kind='bar', figsize=(14, 8), color='slategrey', zorder=2, width=0.7)
ax.set_title("Average Popularity by Genre")
ax.set_xlabel("Genre", labelpad=20, weight='bold', size=10)
ax.set_ylabel("Average Popularity by Genre", labelpad=20, weight='bold', size=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40)

# Write the rounded average just above each bar. The loop variables are named
# so that they do not shadow the `data` name imported from plotnine.
for bar_position, avg_popularity in enumerate(sp1['popularity'].round(0).tolist()):
    plt.text(
        x=bar_position,
        y=avg_popularity + 1,
        s=f"{avg_popularity}",
        fontdict=dict(fontsize=10),
    )
# One popularity histogram per genre.
spotify.hist(column='popularity', by='genre', color='steelblue', figsize=[20, 16])

# Restrict to the 'Pop' genre and average the numeric columns per artist,
# most popular artist first. numeric_only=True avoids the TypeError that
# pandas >= 2.0 raises when a non-numeric column is averaged.
pop = spotify[spotify['genre'] == 'Pop']
sp2 = pop.groupby("artist_name").mean(numeric_only=True)
sp2.sort_values(by='popularity', ascending=False, inplace=True)

# Histogram of Pop popularity shifted by one. The median marker is computed
# from the shifted values so that it sits on the plotted scale.
pop_shifted = pop['popularity'] + 1
sns.histplot(pop_shifted, bins=20)
plt.axvline(x=pop_shifted.median(), color='yellow')
# Two-level label: tracks at or above the median popularity are 'popular',
# everything else is 'not popular'.
m = spotify['popularity'].median()
spotify['popularity_level'] = [
    'popular' if score >= m else 'not popular'
    for score in spotify['popularity']
]
spotify['popularity_level']

# Mean popularity for each level of the three categorical columns.
sns.set(rc={'figure.figsize': (10, 8)})
for category_column, plot_title in (
    ('time_signature', 'Popularity Based on Time Signature'),
    ('key', 'Popularity Based on Key'),
    ('mode', 'Popularity Based on Mode'),
):
    sns.barplot(data=spotify, x=category_column, y='popularity')
    plt.title(plot_title)

# Pairwise scatter matrix of the audio features, coloured by popularity level.
pairplot_columns = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence',
]
sns.pairplot(
    data=spotify,
    vars=pairplot_columns,
    hue='popularity_level',
    markers=["s", "d"],
)
from sklearn import preprocessing

# Replace the key, mode and time_signature columns with integer codes.
# Every column gets its own LabelEncoder so that each fitted encoder keeps
# the classes of exactly one column.
key_encoder = preprocessing.LabelEncoder()
spotify["key"] = key_encoder.fit_transform(spotify["key"])

mode_encoder = preprocessing.LabelEncoder()
spotify["mode"] = mode_encoder.fit_transform(spotify["mode"])

time_signature_encoder = preprocessing.LabelEncoder()
# Use the dedicated encoder here: refitting mode_encoder on this column would
# overwrite the mode classes it learned above.
spotify["time_signature"] = time_signature_encoder.fit_transform(spotify["time_signature"])
spotify.head()
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Ordinary least squares of popularity on the ten audio features.
popularity_formula = (
    'popularity ~ acousticness + danceability + duration_ms + energy '
    '+ instrumentalness + liveness + loudness + speechiness + tempo +valence '
)
model = smf.ols(formula=popularity_formula, data=spotify).fit()
model.summary()

# Keep the residuals on the frame and plot them against two of the regressors.
spotify['popularity_res'] = model.resid
ax2, ax1 = (
    spotify.plot.scatter(x=regressor, y='popularity_res')
    for regressor in ('liveness', 'acousticness')
)
# Work on a copy so that scaling does not overwrite the values in `spotify`.
spotify_norm = spotify.copy()
spotify_norm

import sklearn.preprocessing as skp

features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# Scale the audio features with sklearn.preprocessing.scale.
# - The builtin `float` replaces `np.float`, an alias that was removed in
#   NumPy 1.24 and raises AttributeError there.
# - Assigning through `spotify_norm[features]` replaces the columns outright,
#   so a column that was stored as integers can take the float result without
#   an incompatible-dtype warning or error.
spotify_norm[features] = skp.scale(spotify_norm[features].astype(float))
spotify_norm.head()
spotify_norm.describe()

# Clustered heat map of the pairwise correlations. Only numeric columns are
# passed on, because DataFrame.corr() raises on text columns in pandas >= 2.0.
sns.clustermap(spotify_norm.select_dtypes(include='number').corr(), cmap=plt.cm.OrRd)
import sklearn.decomposition as skd

features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
x = spotify_norm.loc[:, features].values

# Fit a PCA that keeps every component and plot the variance explained by each.
# The x positions are derived from the fitted model rather than hard-coded, so
# the plot stays valid if the feature list changes length.
pca_model = skd.PCA().fit(x)
component_numbers = range(1, len(pca_model.explained_variance_) + 1)
plt.plot(component_numbers, pca_model.explained_variance_, 'b-o')
plt.ylabel('variance')
plt.title('PCA')

# Project onto the first four components.
pca = skd.PCA(n_components=4)
principalComponents = pca.fit_transform(x)
# Carry over the source row index so that the concat below lines each
# component row up with the popularity of the same track, whatever the index.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['pc1', 'pc2', 'pc3', 'pc4'],
    index=spotify_norm.index,
)
finalDf = pd.concat([principalDf, spotify[['popularity']]], axis=1)

# Regress popularity on the four principal components.
model2 = smf.ols(formula='popularity ~ pc1 + pc2+ pc3 +pc4', data=finalDf).fit()
model2.summary()
# Check for missing values introduced by the concat.
finalDf.isnull().value_counts()
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt

# `%matplotlib inline` is IPython syntax and a SyntaxError in plain Python.
# Issue the same magic through the IPython API when running under IPython and
# skip it otherwise (get_ipython is only defined inside an IPython session).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Cluster on a random 10% sample of the tracks.
spotify2 = spotify.sample(frac=0.1)
spotify2.shape
df_kfeature = spotify2[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
                        'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
                        'valence']]
data = df_kfeature.copy()

import sklearn.metrics as metrics

# Fit k-means for each candidate k and record the silhouette score of the
# resulting labelling. Despite its name, `cost` holds silhouette scores.
k_values = range(2, 10)
cost = []
for k in k_values:
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=10)
    k_means.fit(data)
    cost.append(metrics.silhouette_score(data, k_means.labels_))

# Silhouette score against the number of clusters.
fig, ax = plt.subplots()
plt.plot(k_values, cost, 'b*-')
# Start the x-axis at 1 and keep the automatically chosen upper limit.
plt.xlim(1, plt.xlim()[1])
plt.ylabel('Silhouette Score')
plt.xlabel('# of k means')
plt.title('Silhouette Score')
# Fit k-means with two clusters on the sampled feature table and show the
# label assigned to each row.
kmeans = KMeans(n_clusters=2).fit(df_kfeature)
print("Labels:", kmeans.labels_, sep="\n")

# Put the cluster label next to the features of the same row. Both sides are
# renumbered 0..n-1 so that they line up by position.
cluster_labels = pd.DataFrame(kmeans.labels_)
feature_rows = df_kfeature.reset_index(drop=True)
data_joined = pd.concat([cluster_labels, feature_rows], axis=1)
data_joined.head()

# The label column arrives named 0; give it a readable name.
data_joined = data_joined.rename(columns={0: "clusters"})
data_joined['clusters'].value_counts()
# For each feature, overlay its distribution within cluster 0 (red) and
# cluster 1 (blue).
# - sns.histplot replaces the deprecated sns.distplot.
# - Each series carries its own label, so every legend entry is bound to the
#   artist it describes rather than to whichever artists come first on the axes.
# - Every feature gets a fresh figure, and the title is set on that figure's
#   axes, so it cannot land on a plot drawn earlier.
cluster_colors = {0: 'red', 1: 'blue'}
for feature_name, plot_title in (
    ('popularity', 'Distribution of Popularity'),
    ('danceability', 'Distribution of danceability'),
    ('energy', 'Distribution of energy'),
):
    _, dist_ax = plt.subplots()
    for cluster_id, cluster_color in cluster_colors.items():
        in_cluster = data_joined['clusters'] == cluster_id
        sns.histplot(
            data_joined.loc[in_cluster, feature_name],
            kde=True,
            # Density scale, so clusters of different sizes are comparable.
            stat='density',
            color=cluster_color,
            # Semi-transparent bars keep both clusters visible where they overlap.
            alpha=0.4,
            label=str(cluster_id),
            ax=dist_ax,
        )
    dist_ax.set_title(plot_title)
    dist_ax.legend()
    plt.show()